using Common.FrontEnd;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using Util.Props;

namespace FrontEnd.EndPoint
{
    /**
    /// Converts a stream of SpeechClassifiedData objects, each marked as speech
    /// or non-speech, into a stream with the speech regions explicitly marked
    /// out. This is done by inserting SPEECH_START and SPEECH_END signals into
    /// the stream.
    /// <p>
    /// The algorithm for inserting the two signals is as follows.
    /// <p>
    /// The algorithm is always in one of two states: 'in-speech' and
    /// 'out-of-speech'. If 'out-of-speech', it reads audio until it hits audio
    /// that is speech. Once more than 'startSpeech' amount of <i>continuous</i>
    /// speech has been read, speech is considered to have started, and a
    /// SPEECH_START is inserted at 'speechLeader' time before speech first
    /// started. The state of the algorithm changes to 'in-speech'.
    /// <p>
    /// Now consider the case when the algorithm is in the 'in-speech' state. If
    /// it reads audio that is speech, the audio is scheduled for output. If the
    /// audio is non-speech, it reads ahead until it has 'endSilence' amount of
    /// <i>continuous</i> non-speech. At that point speech is considered to have
    /// ended, and a SPEECH_END signal is inserted at 'speechTrailer' time after
    /// the first non-speech audio. The algorithm then returns to the
    /// 'out-of-speech' state. If any speech audio is encountered in between,
    /// the accounting starts all over again.
    /// <p>
    /// While speech audio is being processed, the end-silence threshold is
    /// lowered toward a minimal amount. This helps to segment both slow speech
    /// with audible pauses and fast speech where pauses are minimal.
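    /// <p>
    /// A minimal usage sketch. The wiring below is illustrative: it assumes the
    /// usual BaseDataProcessor.setPredecessor chaining and some predecessor
    /// (here called 'classifier') that emits SpeechClassifiedData:
    /// <code>
    /// SpeechMarker marker = new SpeechMarker(200, 500, 50, 30, 50, 15.0);
    /// marker.initialize();
    /// marker.setPredecessor(classifier); // emits SpeechClassifiedData frames
    /// for (IData d = marker.getData(); d != null; d = marker.getData()) {
    ///     // audio data framed by SpeechStartSignal/SpeechEndSignal pairs
    /// }
    /// </code>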
     */
    public class SpeechMarker : BaseDataProcessor
    {

        /**
        /// The property for the minimum amount of time in speech (in milliseconds)
        /// to be considered as the start of an utterance.
         */
        [S4Integer(defaultValue = 200)]
        public static String PROP_START_SPEECH = "startSpeech";
        private int startSpeechTime;


        /**
        /// The property for the amount of time in silence (in milliseconds) to be
        /// considered as the end of an utterance.
         */
        [S4Integer(defaultValue = 500)]
        public static String PROP_END_SILENCE = "endSilence";
        private int endSilenceTime;


        /**
        /// The property for the amount of time (in milliseconds) before speech start
        /// to be included as speech data.
         */
        [S4Integer(defaultValue = 50)]
        public static String PROP_SPEECH_LEADER = "speechLeader";
        private int speechLeader;

        /**
        /// The property for the number of frames to keep in the buffer. It should
        /// be large enough to allow inserting the SpeechStartSignal.
         */
        [S4Integer(defaultValue = 30)]
        public static String PROP_SPEECH_LEADER_FRAMES = "speechLeaderFrames";
        private int speechLeaderFrames;

        /**
        /// The property for the amount of time (in milliseconds) after speech ends to be
        /// included as speech data.
         */
        [S4Integer(defaultValue = 50)]
        public static String PROP_SPEECH_TRAILER = "speechTrailer";
        private int speechTrailer;
    
        /**
        /// The property to decrease the end-silence threshold while we are
        /// reading speech. This lets the marker adapt to fast speech with small
        /// pauses: the threshold decays from endSilence toward the
        /// speechLeader + speechTrailer floor, so an utterance shouldn't run
        /// much longer than this number of seconds.
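        /// <p>
        /// Worked example with the defaults: endSilence = 500 ms and the floor
        /// speechLeader + speechTrailer = 100 ms; with endSilenceDecay = 15 the
        /// threshold shrinks linearly with accumulated speech and reaches the
        /// 100 ms floor after about 15 seconds (see countSpeechFrame).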
         */    
        [S4Double(defaultValue = 15.0)]
        public static String PROP_END_SILENCE_DECAY = "endSilenceDecay";
        private double endSilenceDecay;

        private LinkedList<IData> outputQueue;  // Audio objects are added to the end
        private Boolean _inSpeech;
        private int frameCount;               // speech frames counted since the last speech start
        private int initialEndSilenceTime;    // configured endSilenceTime, restored for each utterance

        public SpeechMarker(int startSpeechTime, int endSilenceTime, int speechLeader, int speechLeaderFrames, int speechTrailer, double endSilenceDecay) 
        {
            this.startSpeechTime = startSpeechTime;
            this.endSilenceTime = endSilenceTime;
            this.speechLeader = speechLeader;
            this.speechLeaderFrames = speechLeaderFrames;
            this.speechTrailer = speechTrailer;
            this.endSilenceDecay = endSilenceDecay;       
            this.initialEndSilenceTime = endSilenceTime;
        }

        public SpeechMarker() {
        }

        public override void newProperties(PropertySheet ps)
        {
            base.newProperties(ps);

            startSpeechTime = ps.getInt(PROP_START_SPEECH);
            endSilenceTime = ps.getInt(PROP_END_SILENCE);
            speechLeader = ps.getInt(PROP_SPEECH_LEADER);
            speechLeaderFrames = ps.getInt(PROP_SPEECH_LEADER_FRAMES);
            speechTrailer = ps.getInt(PROP_SPEECH_TRAILER);
            endSilenceDecay = ps.getDouble(PROP_END_SILENCE_DECAY);
        
            initialEndSilenceTime = endSilenceTime;
        }


        /**
        /// Initializes this SpeechMarker
         */
        public override void initialize() 
        {
            base.initialize();
            reset();
        }


        /**
        /// Resets this SpeechMarker to a starting state.
         */
        private void reset() 
        {
            _inSpeech = false;
            frameCount = 0;
            this.outputQueue = new LinkedList<IData>();
        }


        /**
        /// Returns the next Data object.
         *
        /// @return the next Data object, or null if none available
        /// @throws DataProcessingException if a data processing error occurs
         */
        public override IData getData()
        {
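            // Keep at least 'speechLeaderFrames' frames buffered so a
            // SPEECH_START can still be inserted before audio that has already
            // been read from the predecessor.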
            while (outputQueue.Count < speechLeaderFrames) 
            {
                IData audio = readData();
            
                if (audio != null) {
                    if (!_inSpeech) {
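                        // out-of-speech: buffer incoming audio and look for
                        // enough continuous speech to mark a SPEECH_START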

                        if (audio is SpeechClassifiedData) {
                            SpeechClassifiedData data = (SpeechClassifiedData) audio;

                            sendToQueue(audio);

                            if (data.isSpeech()) {
                                Boolean speechStarted = handleFirstSpeech(data);
                                if (speechStarted) {
                                    addSpeechStart();
                                    _inSpeech = true;
                                    startCountingFrames();
                                }
                            }
                        } else if (audio is DataStartSignal) {
                            reset();
                            sendToQueue(audio);
                        } else {
                            sendToQueue(audio);
                        }
                    } else {
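                        // in-speech: forward audio and look for enough
                        // continuous silence to mark a SPEECH_END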
                        if (audio is SpeechClassifiedData) {
                            SpeechClassifiedData data =
                                    (SpeechClassifiedData) audio;
                            sendToQueue(data);
                            if (!data.isSpeech()) {
                                _inSpeech = !(readEndFrames(data));
                            } else {
                                countSpeechFrame();
                            }
                        } 
                        else if (audio is DataEndSignal) 
                        {
                            sendToQueue(new SpeechEndSignal(((Signal) audio).getTime()));
                            sendToQueue(audio);
                            _inSpeech = false;
                        } 
                        else if (audio is DataStartSignal) 
                        {
                            reset();
                            sendToQueue(audio);
                        }
                    }
                } else {
                    break;
                }
            }

            if (outputQueue.Count != 0) 
            {
                IData audio = outputQueue.First.Value;
                outputQueue.RemoveFirst();

                if (audio is SpeechClassifiedData) {
                    SpeechClassifiedData data = (SpeechClassifiedData) audio;
                    audio = data.getDoubleData();
                }

                if (audio is DataStartSignal)
                    DataStartSignal.tagAsVadStream((DataStartSignal) audio);

                return audio;
            } else {
                return null;
            }
        }


        /**
        /// Counts one speech frame and decays the end-silence threshold toward
        /// the speechLeader + speechTrailer floor (see PROP_END_SILENCE_DECAY).
         */
        private void countSpeechFrame() 
        {
            frameCount++;
            int minTime = speechLeader + speechTrailer;

            // frameCount / 100.0 converts frames to seconds of speech (assuming
            // 10 ms frames), so the threshold reaches the floor after
            // 'endSilenceDecay' seconds of continuous speech.
            endSilenceTime = (int) (initialEndSilenceTime -
                    (initialEndSilenceTime - minTime) / endSilenceDecay *
                    (frameCount / 100.0));

            if (endSilenceTime <= minTime)
                endSilenceTime = minTime;
        }

        /**
        /// Resets the speech-frame counter and restores the configured
        /// end-silence threshold at the start of a new speech segment.
         */
        private void startCountingFrames() 
        {
            frameCount = 0;
            endSilenceTime = initialEndSilenceTime;
        }

        private IData readData()
        {
            return getPredecessor().getData();
        }

        private void sendToQueue(IData audio) 
        {
            outputQueue.AddLast(audio);
        }

        /**
        /// Returns the amount of audio data in milliseconds in the given SpeechClassifiedData object.
         *
        /// @param audio the SpeechClassifiedData object
        /// @return the amount of audio data in milliseconds
         */
        public int getAudioTime(SpeechClassifiedData audio) 
        {
            return (int)
                    (audio.getValues().Length * 1000.0f / audio.getSampleRate());
        }


        /**
        /// Handles a SpeechClassifiedData object that can possibly be the first in an utterance.
         *
        /// @param audio the SpeechClassifiedData to handle
        /// @return true if utterance/speech has started for real, false otherwise
        /// @throws DataProcessingException if a data processing error occurs
         *
         */
        private Boolean handleFirstSpeech(SpeechClassifiedData audio)
        {
            int speechTime = getAudioTime(audio);

            // try to read more than 'startSpeechTime' amount of
            // audio that is labeled as speech (the condition for speech start)

            while (speechTime < startSpeechTime) {
                IData next = readData();

                if (next == null) {
                    return false;
                }

                sendToQueue(next);

                if (next is SpeechClassifiedData) {
                    if (!((SpeechClassifiedData) next).isSpeech()) {
                        return false;
                    } else {
                        speechTime += getAudioTime((SpeechClassifiedData) next);
                    }
                }
            }
            return true;
        }


        /**
        /// Backtrack from the current position to add a SPEECH_START Signal to the outputQueue.
         */
        private void addSpeechStart() 
        {
            long lastCollectTime = 0;
            int silenceLength = 0, initialSpeechLength = 0;
            LinkedListNode<IData> node = outputQueue.Last;

            // backtrack until we have 'speechLeader' amount of non-speech and
            // have accounted for the 'startSpeech' amount of initial speech
            while ((silenceLength < speechLeader || initialSpeechLength < startSpeechTime) && node != null) {
                IData current = node.Value;
                if (current is SpeechClassifiedData) {
                    SpeechClassifiedData data = (SpeechClassifiedData) current;
                    if (data.isSpeech()) {
                        initialSpeechLength += getAudioTime(data);
                    } else {
                        silenceLength += getAudioTime(data);
                    }
                    lastCollectTime = data.getCollectTime();
                } else if (current is DataStartSignal || current is SpeechEndSignal) {
                    // put the SPEECH_START right after the UTTERANCE_START
                    outputQueue.AddAfter(node, new SpeechStartSignal(lastCollectTime));
                    return;
                } else if (current is DataEndSignal) {
                    throw new Exception("Illegal signal " + current);
                }
                node = node.Previous;
            }

            // Add the SPEECH_START just before the earliest frame we backtracked over.
            if (node == null) {
                outputQueue.AddFirst(new SpeechStartSignal(lastCollectTime));
            } else {
                outputQueue.AddAfter(node, new SpeechStartSignal(lastCollectTime));
            }
        }


        /**
        /// Given a non-speech frame, try to read more non-speech frames until we think it's the end of the utterance.
         *
        /// @param audio a non-speech frame
        /// @return true if speech has really ended, false if speech has not ended
        /// @throws DataProcessingException if a data processing error occurs
         *
         */
        private Boolean readEndFrames(SpeechClassifiedData audio)
        {

            Boolean speechEndAdded = false;
            Boolean readTrailer = true;
            int originalLast = outputQueue.Count - 1;
            int silenceLength = getAudioTime(audio);

            // read ahead until we have 'endSilenceTime' amount of silence
            while (silenceLength < endSilenceTime) 
            {
                IData next = readData();
                if (next is SpeechClassifiedData) 
                {
                    SpeechClassifiedData data = (SpeechClassifiedData) next;
                    sendToQueue(data);
                    if (data.isSpeech()) {
                        // if speech is detected again, we're still in
                        // an utterance
                        return false;
                    } else {
                        // it is non-speech
                        silenceLength += getAudioTime(data);
                    }
                } 
                else if (next is DataEndSignal) 
                {
                    sendToQueue(next);
                    readTrailer = false;
                    break;
                } 
                else if (next is Signal) 
                {
                    throw new Exception("Illegal signal: " + next);
                }
            }

            if (readTrailer) {
                // read ahead until we have 'speechTrailer' amount of silence
                while (!speechEndAdded && silenceLength < speechTrailer) {
                    IData next = readData();
                    if (next is SpeechClassifiedData) {
                        SpeechClassifiedData data = (SpeechClassifiedData) next;
                        if (data.isSpeech()) {
                            // if we have hit speech again, then the current
                            // speech should end
                            sendToQueue(new SpeechEndSignal(data.getCollectTime()));
                            sendToQueue(data);
                            speechEndAdded = true;
                            break;
                        } else {
                            silenceLength += getAudioTime(data);
                            sendToQueue(data);
                        }
                    } else if (next is DataEndSignal) {
                        sendToQueue(new SpeechEndSignal(((Signal) next).getTime()));
                        sendToQueue(next);
                        speechEndAdded = true;
                    } else {
                        throw new Exception("Illegal signal: " + next);
                    }
                }
            }

            if (!speechEndAdded) {
                // walk forward from the first trailing non-speech frame until
                // we have 'speechTrailer' amount of non-speech, and then add a
                // SPEECH_END at that point
                LinkedListNode<IData> node = outputQueue.First;
                for (int pos = 0; pos < originalLast && node != null; pos++) {
                    node = node.Next;
                }

                long nextCollectTime = 0;

                // the 'firstSampleNumber' of SPEECH_END actually contains
                // the last sample number of the segment
                long lastSampleNumber = 0;
                silenceLength = 0;

                while (silenceLength < speechTrailer && node != null) 
                {
                    IData next = node.Value;
                    if (next is DataEndSignal) 
                    {
                        // the SPEECH_END goes right before the DataEndSignal
                        break;
                    } 
                    else if (next is SpeechClassifiedData) 
                    {
                        SpeechClassifiedData data = (SpeechClassifiedData) next;
                        nextCollectTime = data.getCollectTime();
                        Trace.Assert(!data.isSpeech());
                        silenceLength += getAudioTime(data);
                        lastSampleNumber = data.getFirstSampleNumber() +
                                data.getValues().Length - 1;
                    }
                    node = node.Next;
                }

                if (speechTrailer > 0) {
                    Trace.Assert(nextCollectTime != 0 && lastSampleNumber != 0);
                }

                if (node == null) {
                    outputQueue.AddLast(new SpeechEndSignal(nextCollectTime));
                } else {
                    outputQueue.AddBefore(node, new SpeechEndSignal(nextCollectTime));
                }
            }
            return true;
        }

        public Boolean inSpeech() 
        {
            return _inSpeech;
        }

    }
}
